import jieba
import re

# Input corpus path and output path for the segmented result.
filePath = 'corpus.txt'
fileSegWordDonePath = 'corpusSegDone.txt'

# Load the whole corpus into a list, one entry per line of the file
# (each entry keeps its line ending as read).
with open(filePath, encoding='utf-8') as fileTrainRaw:
    fileTrainRead = fileTrainRaw.readlines()

# Remove punctuation from every line.
# The pattern is one character class listing ASCII and full-width/CJK
# punctuation; every run of such characters is deleted outright (not
# replaced by a space). Newlines are not in the class, so they survive.
remove_chars = '[·’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
_punctuation_run = re.compile(remove_chars)
fileTrainClean = [_punctuation_run.sub("", raw_line) for raw_line in fileTrainRead]

# Segment every cleaned line with jieba (precise mode, cut_all=False).
file_userDict = 'dict.txt'  # custom user dictionary
jieba.load_userdict(file_userDict)

# Each entry of fileTrainSeg is a one-element list holding the
# space-joined tokens of one line.
fileTrainSeg = []
for line_no, cleaned_line in enumerate(fileTrainClean):
    # The [7:-7] slice is meant to drop the <content> tag text at both ends
    # of the line; adjust the offsets to match your own corpus.
    # NOTE(review): the slice runs after punctuation has been stripped and
    # while the line normally still ends with its newline, so the trailing
    # offset counts that newline too -- confirm against the real data.
    tokens = jieba.cut(cleaned_line[7:-7], cut_all=False)
    fileTrainSeg.append([' '.join(tokens)])
    # Progress indicator: print the index once every 100 lines.
    if line_no % 100 == 0:
        print(line_no)

# Write the segmented text, one line per corpus entry, as UTF-8 bytes.
# Binary mode keeps the line terminator a bare "\n" on every platform.
with open(fileSegWordDonePath, 'wb') as fW:
    fW.writelines(
        (segmented[0] + '\n').encode('utf-8') for segmented in fileTrainSeg
    )


